Salaries Data import and cleaning

Descrition of data cleaning:

  • reading in data

  • selecting and renaming the columns

  • since names are in last, first format, reorder them to be first last

  • make sure all “Jr”s go at the end

  • remove accents using stringi::stri_trans_general

salaries = read_excel(
  "data/MLB-Salaries 2000-24.xlsx", 
  sheet = "2022.xls",
  skip = 1) |>
  select(1:4) |>
  rename(position = "Pos'n",
         salary_2022 = "2022.0",
         service_time_yrs = "MLS",
         name = "Player") %>%
  mutate(name = str_split(name, ","),
         name = map(name, rev),       
         name = map_chr(name, str_c, collapse = " "),
         name = str_trim(name),
         name = str_replace_all(name, "[*#\\.,]", ""),
         junior = str_detect(name, "Jr"),
         name = if_else(junior == TRUE, str_remove(name, " Jr"), name),
         name = if_else(junior == TRUE, str_c(name, " Jr"), name),
         name = stringi::stri_trans_general(name,id = "Latin-ASCII")) |>
  select(-junior) %>%
  mutate(simple_position = str_split_i(position, "-", 1),
         simple_position = fct_relevel(simple_position,
                                       c("c", "1b", "2b", "3b", "ss",
                                         "lf", "cf", "rf", "inf", "of",
                                         "dh", "rhp", "lhp")),
         service_time_floor = floor(service_time_yrs))
## New names:
## • `` -> `...5`

Batting and Pitching data import and cleaning

Batting

Descrition of data cleaning:

  • reading in data and clean names

  • grouping by name and using a count to ensure we only end up with one of each name (slightly complicated due to interleague trades and potential for players with the same name)

  • remove whitespace, from names and use stringi again to remove accents so that the formatting exactly matches the salaries tibble

  • Filter out those with fewer than 100 plate appearances

batting = read_delim("data/2022 MLB Player Stats - Batting.csv", delim = ";",
                     locale = locale(encoding = "latin1")) |>
  janitor::clean_names() %>%
  group_by(name) |>
  mutate(name_count = n(),
         keep_row = case_when(name_count == 1 ~ TRUE,
                              name_count > 1 & tm == "TOT" ~ TRUE,
                              .default = FALSE)) |>
  filter(keep_row == TRUE) %>%
  mutate(name_count = n(),
         keep_row = case_when(name_count == 1 ~ TRUE,
                              name_count > 1 & lg == "MLB" ~ TRUE,
                              .default = FALSE)) |>
  ungroup() |>
  filter(keep_row == TRUE) %>%
  mutate(name = str_split(name, "\\s+"),
         name = map_chr(name, str_c, collapse = " "),
         name = str_trim(name),
         name = str_replace_all(name, "[*#\\.]", ""),
         name = stringi::stri_trans_general(name,id = "Latin-ASCII"))
## Rows: 992 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr  (3): Name, Tm, Lg
## dbl (26): Rk, Age, G, PA, AB, R, H, 2B, 3B, HR, RBI, SB, CS, BB, SO, BA, OBP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
merged_batting <- salaries |>
  mutate(simple_position = str_split_i(position, "-", 1)) |>
  filter(!simple_position %in% c("rhp","lhp")) |>
  inner_join(batting, by = "name") |>
  filter(pa >= 100)

salary_not_in_batting <- anti_join(salaries, batting, by = "name")
batting_not_in_salary <- anti_join(batting, salaries, by = "name")

Pitching

(same data cleaning process as above)

pitching = read_delim("data/2022 MLB Player Stats - Pitching.csv", delim = ";",
                     locale = locale(encoding = "latin1")) |>
  janitor::clean_names() %>%
  group_by(name) |>
  mutate(name_count = n(),
         keep_row = case_when(name_count == 1 ~ TRUE,
                              name_count > 1 & tm == "TOT" ~ TRUE,
                              .default = FALSE)) |>
  filter(keep_row == TRUE) %>%
  mutate(name_count = n(),
         keep_row = case_when(name_count == 1 ~ TRUE,
                              name_count > 1 & lg == "MLB" ~ TRUE,
                              .default = FALSE)) |>
  ungroup() |>
  filter(keep_row == TRUE) %>%
  mutate(name = str_split(name, "\\s+"),
         name = map_chr(name, str_c, collapse = " "),
         name = str_trim(name),
         name = str_replace_all(name, "[*#\\.]", ""),
         name = stringi::stri_trans_general(name,id = "Latin-ASCII"))
## Rows: 1081 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr  (3): Name, Tm, Lg
## dbl (32): Rk, Age, W, L, W-L%, ERA, G, GS, GF, CG, SHO, SV, IP, H, R, ER, HR...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
merged_pitching <- inner_join(salaries, pitching, by = "name") |>
  filter(ip >= 20) |>
  separate(ip, into = c("ip", "ip_dec"), remove = FALSE, convert = TRUE) |>
  mutate(ip_dec_333 = ifelse(is.na(ip_dec), 0, ip_dec * 333)  ,
         ip_total = paste(ip, ip_dec_333, sep = "."),
         ip_total = as.numeric(ip_total)) |>
  select(-ip, -ip_dec, -ip_dec_333) |>
  separate(position, into = c("hand", "pitcher_type")) |>
  mutate(pitcher_type = ifelse(is.na(pitcher_type), "r", pitcher_type)) |>
  filter(hand == "lhp" | hand == "rhp")
## Warning in inner_join(salaries, pitching, by = "name"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 325 of `x` matches multiple rows in `y`.
## ℹ Row 265 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 172 rows [5, 6, 10, 12,
## 14, 22, 24, 27, 30, 32, 33, 34, 36, 38, 40, 41, 42, 43, 45, 47, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 336 rows [24, 31, 34, 40,
## 44, 46, 49, 53, 55, 58, 59, 63, 65, 68, 70, 72, 77, 79, 81, 82, ...].
salary_not_in_pitching <- anti_join(salaries, pitching, by = "name")
pitching_not_in_salary <- anti_join(pitching, salaries, by = "name")

Note that there are a number of names in salaries not found in batting or pitching, and vice versa. Some of this may be genuine missingness (e.g. salaries has fewer rows than pitching and batting combined), but some is also due to alternative spellings of names in the datasets. Possibly could be fixed using other matching methods, but the analysis pipeline will still work from here.

Salaries EDA

Some Basic EDA for salaries on their own

Service time (i.e. number of years of experience)

salaries |>
  count(service_time_floor) |>
  ggplot(aes(x = service_time_floor, y = n)) + 
  geom_col() + 
  labs(title = "Number of Players per Year of Experience",
       x = "Years in the MLB",
       y = "Number of Players") +
  scale_x_continuous(breaks = seq(0,21,by=1)) +
  theme_bw()

This is right skewed, so may need to transform it if used in regression

Salary

salaries |>
  ggplot(aes(x = salary_2022)) + 
  geom_density() + 
  labs(title = "2022 Salary Distribution",
       x = "2022 Salary",
       y = "Density") +
  scale_x_continuous(labels = label_comma()) +
  theme_bw()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_density()`).

salaries |>
  mutate(position_group = case_match(simple_position,
                                     c("rhp","lhp") ~ "pitcher",
                                     c("cf","lf","rf","of") ~ "outfield",
                                     "dh" ~ "dh",
                                     .default = "infield")) |>
  ggplot(aes(x = salary_2022, color = position_group)) +
  geom_density() +
  labs(title = "Salary by Position Group",
       x = "2022 Salary",
       y = "Density") +
  scale_x_continuous(labels = label_comma()) +
  scale_color_discrete(name = "Position Group",
                       labels = c("Designated Hitter",
                                "Infield",
                                "Outfield",
                                "Pitcher")) +
  theme_bw()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_density()`).

This is also heavily right skewed, with many players having salary under $1,000,000

Position type table

merged_batting |>
  count(simple_position) |> 
  mutate(simple_position = fct_infreq(simple_position,w = n)) |>
  ggplot(aes(x = simple_position, y = n)) +
  geom_col()

Salaries by Position Boxplots

salaries %>%
  ggplot(aes(x = simple_position, y = salary_2022)) +
  geom_boxplot()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Infielders vs. outfielders vs. pitchers?

salaries |>
  mutate(position_group = case_match(simple_position,
                                     c("rhp","lhp") ~ "pitcher",
                                     c("cf","lf","rf","of") ~ "outfield",
                                     "dh" ~ "dh",
                                     .default = "infield")) |>
  ggplot(aes(x = position_group, y = salary_2022)) +
  geom_boxplot()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Salaries by Experience

salaries %>%
  ggplot(aes(x = factor(service_time_floor), y = salary_2022)) +
  geom_boxplot() 
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

salaries %>%
  group_by(service_time_floor) %>%
  summarize(avg_salary = mean(salary_2022, na.rm = TRUE)) %>%
  ggplot(aes(x = service_time_floor, y = avg_salary)) +
  geom_col() 

salaries |>
  ggplot(aes(x = service_time_floor, y = salary_2022)) + 
  geom_point(alpha = 0.5) 
## Warning: Removed 295 rows containing missing values or values outside the scale range
## (`geom_point()`).

Positions by Experience

salaries %>%
  ggplot(aes(x=floor(service_time_yrs),fill = simple_position)) +
  geom_bar(position = "fill") 

salaries |>
  ggplot(aes(x = service_time_floor, 
             y = simple_position, 
             fill = simple_position)) + 
  ggridges::geom_density_ridges(alpha = 0.5) 
## Picking joint bandwidth of 1.21

Salary Distribution by Experience

salaries %>%
  ggplot(aes(x=factor(floor(service_time_yrs)), y=salary_2022)) +
  geom_boxplot()
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Batting EDA

Distributions of variables

Looking at some hitting statistics

merged_batting |> 
  ggplot(aes(x = hr)) + 
  geom_histogram() +
  labs(x = "Number of home runs (HRs)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

merged_batting |> 
  ggplot(aes(x = rbi)) + 
  geom_histogram() +
  labs(x = "Runs batted in (RBI)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

These are also left skewed so will need to normalize for any linear model.

OPS is an “all in one” statistic combining on-base percentage (OBP) and slugging (SLG).

OBP is calculated as \(\frac{Hits (H) + Walks (BB) + Hit by pitch (HBP)}{At \ bats (AB) + Walks (BB) + sacrifice \ flies (SF) + Hit by pitch (HBP)}\)

Slugging is calculated as \(\frac{total \ bases (TB)}{At \ bats (AB)}\)

OPS is the sum of these two statistics.

OPS+ (or adjusted OPS) is adjusted for the park and league averages.

merged_batting |> 
  ggplot(aes(x = ops)) + 
  geom_histogram() +
  labs(x = "OPS")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

merged_batting |> 
  ggplot(aes(x = ops_2)) + 
  geom_histogram() +
  labs(x = "OPS+")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

One idea might be to compare OPS or OPS+ to more traditional statistics, like RBIs, HRs, or batting averages to OPS or OPS+ as single predictors of salary.

One other factor to consider is the correlation between some of these variables

merged_batting |>
  select(hr, rbi, pa, ba, ops) |>
  GGally::ggpairs()
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Some of these potential predictors are fairly strongly correlated (like RBI and HR, or BA and OPS), so it’s important not to include to many collinear variables in a potential linear model.

Relationship to salary

Looking at some relationships to salary

Position vs. salary

merged_batting |>
  mutate(positions = str_split_i(position, "-",1))  |>
  ggplot(aes(x = positions,y= salary_2022)) +
  geom_boxplot()
## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

merged_batting |>
  mutate(positions = str_split_i(position, "-",1))  |>
  group_by(positions) |>
  summarize(avg_salary = mean(salary_2022, na.rm = TRUE),
            median_salary = median(salary_2022, na.rm = TRUE))
## # A tibble: 11 × 3
##    positions avg_salary median_salary
##    <chr>          <dbl>         <dbl>
##  1 1b          8845755.       6225000
##  2 2b          5010094.       2550000
##  3 3b          8302384.       3375000
##  4 c           3669983.       1750000
##  5 cf          8145270.       5500000
##  6 dh         11857143.      12000000
##  7 inf         1312500        1312500
##  8 lf          6971404.       6125000
##  9 of          2211196.        746100
## 10 rf          7833411.       4875000
## 11 ss          7266837        4000000

HR vs. salary

merged_batting |>
  ggplot(aes(x = hr, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

RBI vs. salary

merged_batting |>
  ggplot(aes(x = rbi, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

BA vs. salary

merged_batting |>
  ggplot(aes(x = ba, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

Plate appearances vs. salary

merged_batting |>
  ggplot(aes(x = pa, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

OPS+ vs salary

merged_batting |>
  ggplot(aes(x = ops_2, y = salary_2022)) + 
  geom_point()
## Warning: Removed 44 rows containing missing values or values outside the scale range
## (`geom_point()`).

Since salary (and other variables) are very skewed, we may want to transform these data using a log transformation.

Regression models

Home runs

hr_fit <- lm(log(salary_2022,10) ~ hr,
             data = merged_batting)

hr_fit |>
  broom::tidy()
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   6.22     0.0445     140.   2.09e-310
## 2 hr            0.0206   0.00291      7.08 7.87e- 12

OPS+

ops_fit <- lm(log(salary_2022,10) ~ ops_2,
             data = merged_batting)

ops_fit |>
  broom::tidy()
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)  5.96      0.100       59.4  1.38e-185
## 2 ops_2        0.00531   0.00100      5.28 2.22e-  7

Would also want to account for years of experience in predicting salary

experience_fit <- lm(log(salary_2022,10) ~ service_time_floor,
                     data = merged_batting)

experience_fit |>
  broom::tidy()
## # A tibble: 2 × 5
##   term               estimate std.error statistic  p.value
##   <chr>                 <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)           5.98    0.0312      192.  0       
## 2 service_time_floor    0.115   0.00562      20.5 6.59e-62

A multivariable model including all of these

multivar_fit <- lm(log(salary_2022,10) ~ hr + ops_2 + service_time_floor,
                   data = merged_batting)

multivar_fit |>
  broom::tidy()
## # A tibble: 4 × 5
##   term               estimate std.error statistic   p.value
##   <chr>                 <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)         5.69     0.0759       75.0  1.13e-217
## 2 hr                  0.0115   0.00285       4.04 6.53e-  5
## 3 ops_2               0.00176  0.000948      1.85 6.47e-  2
## 4 service_time_floor  0.110    0.00522      21.1  2.06e- 64

Pitching EDA

In order to eliminate pitchers with very few appearances or position players (catchers, first basemen, etc.) who came in to pitch in blowouts, I filtered only for pitchers with at least 10 innings pitches.

Distribution of Innings Pitched

merged_pitching |> 
  ggplot(aes(x = ip_total)) + 
  geom_histogram(binwidth = 2, fill = "lightblue", col = "black")

Distribution of ERA+

merged_pitching |>
  ggplot(aes(x = era_2)) + 
  geom_histogram(fill = "lightblue", col = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

IP vs. Salary Plot

merged_pitching |> 
  ggplot(aes(x = ip_total, y = salary_2022)) + 
  geom_point() + 
  scale_x_continuous(lim = c(0, 400))
## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).

ERA+ vs. Salary Plot

merged_pitching |> 
  ggplot(aes(x = era_2, y = salary_2022)) + 
  geom_point() +
  scale_x_continuous(lim = c(0, 400))
## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).

IP vs. Log Salary Plot

merged_pitching |> 
  ggplot(aes(x = ip_total, y = log(salary_2022))) + 
  geom_point() + 
  scale_x_continuous(lim = c(0, 400))
## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).

ERA+ vs. Log Salary Plot

merged_pitching |> 
  ggplot(aes(x = era_2, y = log(salary_2022))) + 
  geom_point() + 
  scale_x_continuous(lim = c(0, 400))
## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).

LMs

lm_ip = lm(salary_2022 ~ ip_total, data = merged_pitching)
summary(lm_ip)
## 
## Call:
## lm(formula = salary_2022 ~ ip_total, data = merged_pitching)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -7881584 -2346730 -1339157   542534 36868673 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   453710     505732   0.897     0.37    
## ip_total       41360       5280   7.833    4e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5394000 on 417 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.1283, Adjusted R-squared:  0.1262 
## F-statistic: 61.35 on 1 and 417 DF,  p-value: 4.002e-14
lm_ip_data = tibble(fitted.values(lm_ip), residuals(lm_ip))

names(lm_ip_data) = c("fitted", "resid")

lm_ip_data |>
  ggplot(aes(x = fitted, y = resid)) +
  geom_point()

view(lm_ip_data)
lm_era_2 = lm(salary_2022 ~ era_2, data = merged_pitching)
summary(lm_era_2)
## 
## Call:
## lm(formula = salary_2022 ~ era_2, data = merged_pitching)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4017039 -3088950 -2721275   385077 39372391 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3560272     674783   5.276 2.12e-07 ***
## era_2           2371       5297   0.448    0.655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5776000 on 417 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.0004801,  Adjusted R-squared:  -0.001917 
## F-statistic: 0.2003 on 1 and 417 DF,  p-value: 0.6547
lm_era_2_data = tibble(fitted.values(lm_era_2), residuals(lm_era_2))

names(lm_era_2_data) = c("fitted", "resid")

lm_era_2_data |>
  ggplot(aes(x = fitted, y = resid)) +
  geom_point()

view(lm_era_2_data)

Log-Transformed LMs

lm_ip_log = lm(log(salary_2022) ~ ip_total, data = merged_pitching)
summary(lm_ip_log)
## 
## Call:
## lm(formula = log(salary_2022) ~ ip_total, data = merged_pitching)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8882 -0.7565 -0.4034  0.7288  3.1811 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.776177   0.097656 141.069  < 2e-16 ***
## ip_total     0.008032   0.001020   7.877 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.042 on 417 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.1295, Adjusted R-squared:  0.1274 
## F-statistic: 62.05 on 1 and 417 DF,  p-value: 2.933e-14
lm_ip_log_data = tibble(fitted.values(lm_ip_log), residuals(lm_ip_log))

names(lm_ip_log_data) = c("fitted", "resid")

lm_ip_log_data |>
  ggplot(aes(x = fitted, y = resid)) +
  geom_point()

view(lm_ip_log_data)
lm_era_2_log = lm(log(salary_2022) ~ era_2, data = merged_pitching)
summary(lm_era_2_log)
## 
## Call:
## lm(formula = log(salary_2022) ~ era_2, data = merged_pitching)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9979 -0.9559 -0.5614  0.8333  3.1417 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.441e+01  1.304e-01 110.497   <2e-16 ***
## era_2       1.869e-04  1.024e-03   0.183    0.855    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.116 on 417 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  7.994e-05,  Adjusted R-squared:  -0.002318 
## F-statistic: 0.03334 on 1 and 417 DF,  p-value: 0.8552
lm_era_2_data_log = tibble(fitted.values(lm_era_2_log), residuals(lm_era_2_log))

names(lm_era_2_data_log) = c("fitted", "resid")

lm_era_2_data_log |>
  ggplot(aes(x = fitted, y = resid)) +
  geom_point()

Final Pitching LM

lm_pitching = lm(salary_2022 ~ hand + pitcher_type + hand*pitcher_type + ip_total + era_2,
                 data = merged_pitching)

summary(lm_pitching)
## 
## Call:
## lm(formula = salary_2022 ~ hand + pitcher_type + hand * pitcher_type + 
##     ip_total + era_2, data = merged_pitching)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -9175137 -1716370  -699492   602203 35065484 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            15005127    4956120   3.028  0.00262 **
## handrhp                -8606379    5351662  -1.608  0.10857   
## pitcher_typer         -14812000    4970647  -2.980  0.00305 **
## pitcher_types          -8591542    5028351  -1.709  0.08828 . 
## ip_total                  16873       5758   2.930  0.00357 **
## era_2                      4389       4639   0.946  0.34466   
## handrhp:pitcher_typer   8656223    5386994   1.607  0.10885   
## handrhp:pitcher_types   7266736    5438133   1.336  0.18221   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4937000 on 411 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.2803, Adjusted R-squared:  0.268 
## F-statistic: 22.87 on 7 and 411 DF,  p-value: < 2.2e-16

From our model, we can see that relief pitchers have significantly lower salaries than closing pitchers, which makes sense, since closing pitchers have very strong reputations as some of the most effective pitchers in the game and therefore will draw high salaries. Additionally, the era_2 predictor, even when accounting for pitcher handedness, pitcher type, and the number of innings pitched, has an insignificant \(P\)-value, implying that there is no significant association between one of the most highly regarded summative statistics for pitching effectiveness and pitcher salary.